# Loading packages
library(readtext)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidytext)
library(tm)
library(textstem)
library(wordcloud)
library(slam)
library(topicmodels)
library(SentimentAnalysis)
The main objective of this paper is to analyse the Federal Open Market Commitee statements using text mining methods and tools provided by R. We start with basic analysis of (….)
Federal Open Market Committee (FOMC) is the body of the central bank of United States (the Federal Reserve System). Its main duties is setting the national monetary policy. The FOMC holds eight regularly scheduled meetings per year. At these meetings, the Committee reviews economic and financial conditions, determines the appropriate stance of monetary policy, and assesses the risks to its long-run goals of price stability and sustainable economic growth. The FOMC consists of 12 voting members: seven members of the Board of Governors, the president of the Federal Reserve Bank of New York and 4 of the remaining 11 Reserve Bank presidents, who serve one-year terms on a rotating basis. All 12 of the Reserve Bank presidents attend FOMC meetings and participate in FOMC discussions, but only the presidents who are Committee members at the time may vote on policy decisions. FOMC meetings typically are held eight times each year in Washington, D.C., and at other times as needed.
The Committee releases a public statement immediately after each FOMC meeting. Each statement follows very similar structure. Firstly, the general background of the economic situation is presented. Then the Commitee introduces the value of the established federal funds rate and also share predictions. At the end, there are listed names of people which voted for the FOMC monetary policy action.
We sourced the data by scraping the statements from the Federal Reserve official website 1 using Python. In the scraping algorithm we limited the content only to FOMC announcment, omitting the names of voters listed in the last paragraph. The analysed period includes years from 2006 to 2018 which resulted in obtaining 107 documents.
# Loading scrapped statements
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
# "~/Desktop/FOMC-text-mining/Statements"
fomc_2006 <- readtext(paste0(DATA_DIR, "/2006/*"))
fomc_2007 <- readtext(paste0(DATA_DIR, "/2007/*"))
fomc_2008 <- readtext(paste0(DATA_DIR, "/2008/*"))
fomc_2009 <- readtext(paste0(DATA_DIR, "/2009/*"))
fomc_2010 <- readtext(paste0(DATA_DIR, "/2010/*"))
fomc_2011 <- readtext(paste0(DATA_DIR, "/2011/*"))
fomc_2012 <- readtext(paste0(DATA_DIR, "/2012/*"))
fomc_2013 <- readtext(paste0(DATA_DIR, "/2013/*"))
fomc_2014 <- readtext(paste0(DATA_DIR, "/2014/*"))
fomc_2015 <- readtext(paste0(DATA_DIR, "/2015/*"))
fomc_2016 <- readtext(paste0(DATA_DIR, "/2016/*"))
fomc_2017 <- readtext(paste0(DATA_DIR, "/2017/*"))
fomc_2018 <- readtext(paste0(DATA_DIR, "/2018/*"))
# Binding data
statements <- rbind(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)
# Removing files from memory
remove(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)
We start our work on statments with the initial preprocessing of the dataset. It consists of two columns: doc_id and text. Doc_id is sourced from each statement’s website link. Text is just a content of the statement.
head(statements, 1)
## readtext object consisting of 1 document and 0 docvars.
## # Description: df[,2] [1 x 2]
## doc_id text
## * <chr> <chr>
## 1 20060131.txt "\"The Federa\"..."
# adding an unique ID
statements <- statements %>% mutate(ID = 1:n())
# setting column names
colnames(statements) <- c("Date", "Text", "ID")
# modification of doc_id column - changing it to date column
statements$Date <- gsub(".txt", "", statements$Date)
statements$Date <- as.Date(statements$Date, "%Y%m%d ")
statements_all <- as.vector(statements$Text)
length(statements_all)
## [1] 107
The next step was concerting the dataset into volatile corpora which is a handful form in the following operations. Below can be seen an example statement before any text preprocessing operations applied.
(corpus_all <- VCorpus(VectorSource(statements_all)))
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 107
inspect(corpus_all[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 778
##
## The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives.
We start preprocessing with text cleaning using tm_map() function. We lower each case, remove words from the built-in stopwords list, we remove punctuation, unnecessary whitespaces and numbers. At the end we apply PlainTextDocument() function.
corpus_clean <- corpus_all %>%
tm_map(tolower) %>%
tm_map(removeWords, stopwords("en")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(removeNumbers) %>%
tm_map(PlainTextDocument)
Below can be seen examples of the statements after above cleaning steps.
as.character(corpus_clean[[1]])
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
# example statement after cleaning
as.character(corpus_clean[[2]])
## [1] " federal open market committee decided today raise target federal funds rate basis points percent slowing growth real gdp fourth quarter seems largely reflected temporary special factors economic growth rebounded strongly current quarter appears likely moderate sustainable pace yet run prices energy commodities appears modest effect core inflation ongoing productivity gains helped hold growth unit labor costs check inflation expectations remain contained still possible increases resource utilization combination elevated prices energy commodities potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives"
In order to ease operations on the corpus, we modify it into a data frame.
df_corpus <- data.frame(text = unlist(sapply(corpus_clean, `[`, "content")), stringsAsFactors = F)
df_corpus <- df_corpus %>% mutate(doc_id = 1:n())
df_corpus$text[1]
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
In the next steps, we append statements data frame with cleaned text. We also count number of words occuring in the original statement and in the cleaned statement.
statements_clean <- statements %>%
mutate(cleaned_text = df_corpus$text)
# clened_text
count_cleaned_word <- statements_clean %>%
unnest_tokens(word_count, cleaned_text) %>%
count(ID, word_count, sort = T) %>%
group_by(ID) %>%
summarize(word_cleaned_count = sum(n))
statements_clean_count <- left_join(statements_clean, count_cleaned_word, by = 'ID')
count_word <- statements_clean_count %>%
unnest_tokens(word_count, Text) %>%
count(ID, word_count, sort = T) %>%
group_by(ID) %>%
summarize (word_count = sum(n))
statements_final <- left_join(statements_clean_count, count_word, by = 'ID')
(…)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(viridis)
## Loading required package: viridisLite
myplot <- statements_final %>%
select(Date, word_count, word_cleaned_count) %>%
ggplot() +
geom_line(aes(x = Date,
y = word_count),
color = viridis(10)[3]) +
geom_line(aes(x = Date,
y = word_cleaned_count),
color = viridis(10)[6]) +
labs(x = "Date",
y = "Number of words",
title = "Comparison of number of words between original and cleaned <br>statements content over time") +
scale_x_date(date_breaks = "1 year",
date_labels = "%Y") +
theme_minimal()
ggplotly(myplot)
(…)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
statements_words <- statements_clean_count %>%
mutate(year = year(Date)) %>%
unnest_tokens(word_count, cleaned_text) %>%
count(year, word_count, sort = T)
statements_words
## # A tibble: 4,009 x 3
## year word_count n
## <dbl> <chr> <int>
## 1 2014 committee 139
## 2 2013 committee 127
## 3 2014 inflation 114
## 4 2015 committee 112
## 5 2017 inflation 104
## 6 2015 inflation 96
## 7 2011 committee 89
## 8 2013 will 89
## 9 2016 inflation 89
## 10 2012 committee 87
## # ... with 3,999 more rows
statements_words <- statements_words %>%
bind_tf_idf(word_count, year, n)
statements_words
## # A tibble: 4,009 x 6
## year word_count n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 2014 committee 139 0.0384 0 0
## 2 2013 committee 127 0.0399 0 0
## 3 2014 inflation 114 0.0315 0 0
## 4 2015 committee 112 0.0419 0 0
## 5 2017 inflation 104 0.0440 0 0
## 6 2015 inflation 96 0.0359 0 0
## 7 2011 committee 89 0.0429 0 0
## 8 2013 will 89 0.0280 0 0
## 9 2016 inflation 89 0.0353 0 0
## 10 2012 committee 87 0.0416 0 0
## # ... with 3,999 more rows
pd = statements_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word_count, levels = rev(unique(word_count)))) %>%
group_by(year) %>%
top_n(10) %>%
ungroup() %>%
arrange(year, tf_idf) %>%
mutate(order = row_number())
## Selecting by word
ggplot(pd, aes(order, tf_idf, fill = year)) +
geom_bar(show.legend = FALSE, stat = "identity") +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~year, ncol = 3, scales = "free") +
scale_x_continuous(breaks = pd$order,
labels = pd$word,
expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
coord_flip() +
theme_minimal()
library(wordcloud)
dtm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
d <- data.frame(word = names(v), freq=v)
head(d, 10)
## word freq
## committee committee 915
## inflation inflation 834
## will will 638
## economic economic 556
## market market 446
## federal federal 445
## rate rate 418
## labor labor 331
## conditions conditions 326
## securities securities 314
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=viridis(10))
# Lemmatization
statements_final$lemma_text <- lemmatize_strings(statements_final$cleaned_text)
# Tokenization
tokens <- statements_final %>%
unnest_tokens(word, lemma_text)
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
library(viridis)
dupa <- ggplot(statements_topics, aes(x=Date)) +
geom_line(aes(y=topic1), fill =viridis(10)[2]) +
geom_line(aes(y=topic2), color=viridis(10)[5]) +
geom_line(aes(y=topic3), color=viridis(10)[10]) +
geom_line(aes(y=topic4))
## Warning: Ignoring unknown parameters: fill
ggplotly(dupa)
ggplot(statements_topics) +
geom_density( aes(x = topic1, y = ..scaled..), fill = "red", alpha = 0.5) +
geom_density( aes(x = topic2, y = ..scaled..), fill = "green", alpha = 0.5) +
geom_density( aes(x = topic3, y = ..scaled..), fill = "blue", alpha = 0.5) +
geom_density( aes(x = topic4, y = ..scaled..), fill = "yellow", alpha = 0.5)